package spider.utils.html;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import spider.utils.http.HttpGetConnect;


/**
 * Helper methods for parsing HTML with jsoup and extracting fragments of it.
 *
 * <p>{@link #manage(String)} is static; the remaining methods are instance
 * methods but keep no per-instance state.
 *
 * @author: gaoll
 * @CreateTime:2014-11-13
 * @ModifyTime:2014-11-13
 */
public class HtmlManage {

	private static final Log log = LogFactory.getLog(HtmlManage.class);

	// Disabled manual smoke test, kept for reference.
//	public static void main(String[] args) throws IOException{
//		// TODO Auto-generated method stub
//		HttpGetConnect get = new HttpGetConnect();
//		Document doc = manage(get.connect("http://www.oschina.net/tweets","UTF-8"));
//		
//		Elements elems = doc.getElementsByAttributeValueMatching("link", Pattern.compile("http://my.oschina.net/(.)*/[0-9]*"));
//		log.info(elems.size());
//	}

	/**
	 * Parses an HTML string into a jsoup document.
	 *
	 * @param html the HTML text to parse
	 * @return the document produced by {@code Jsoup.parse(html)}
	 */
	public static Document manage(String html){
		return Jsoup.parse(html);
	}

	/**
	 * Fetches a page with jsoup (HTTP GET) and parses the response.
	 *
	 * @param url the address of the page to fetch
	 * @return the parsed document
	 * @throws IOException if jsoup fails to fetch or read the page
	 */
	public Document manageDirect(String url) throws IOException{
		return Jsoup.connect(url).get();
	}

	/**
	 * Returns the text located between two marker strings.
	 *
	 * <p>The first occurrence of {@code start} is located, then the first
	 * occurrence of {@code end} that follows it. Searching for {@code end}
	 * only after {@code start} keeps the method working when the end marker
	 * also appears earlier in the text, or when both markers are identical
	 * (for example a pair of quotes).
	 *
	 * @param html the text to search
	 * @param start the marker that precedes the wanted text
	 * @param end the marker that follows the wanted text
	 * @return the text strictly between the two markers (possibly empty)
	 * @throws StringIndexOutOfBoundsException if {@code start} is not found,
	 *         or {@code end} is not found after it
	 */
	public String manageSplit(String html,String start,String end){
		int startIndex = html.indexOf(start);
		if (startIndex < 0) {
			// Without this check, -1 + start.length() would silently select unrelated text.
			throw new StringIndexOutOfBoundsException("start marker not found: " + start);
		}
		int contentBegin = startIndex + start.length();
		int contentEnd = html.indexOf(end, contentBegin);
		if (contentEnd < 0) {
			throw new StringIndexOutOfBoundsException("end marker not found after start marker: " + end);
		}
		return html.substring(contentBegin, contentEnd);
	}

	/**
	 * Collects the inner HTML of every element with the given tag name.
	 *
	 * @param doc the document to search
	 * @param tag the tag name to look for
	 * @return the inner HTML of each matching element, in the order jsoup
	 *         returns them; empty if nothing matches
	 */
	public List<String> manageHtmlTag(Document doc,String tag ){
		return innerHtmlOf(doc.getElementsByTag(tag));
	}

	/**
	 * Collects the inner HTML of every element carrying the given CSS class.
	 *
	 * @param doc the document to search
	 * @param clas the class name to look for
	 * @return the inner HTML of each matching element, in the order jsoup
	 *         returns them; empty if nothing matches
	 */
	public List<String> manageHtmlClass(Document doc,String clas ){
		return innerHtmlOf(doc.getElementsByClass(clas));
	}

	/**
	 * Collects the inner HTML of every element that has the attribute
	 * {@code key} with a value matching the regular expression {@code value}.
	 *
	 * <p>Matching is delegated to jsoup's
	 * {@code getElementsByAttributeValueMatching}.
	 *
	 * @param doc the document to search
	 * @param key the attribute name to inspect
	 * @param value a regular expression applied to the attribute value
	 * @return the inner HTML of each matching element, in the order jsoup
	 *         returns them; empty if nothing matches
	 * @throws java.util.regex.PatternSyntaxException if {@code value} is not
	 *         a valid regular expression
	 */
	public List<String> manageHtmlKey(Document doc,String key,String value ){
		// Use the caller-supplied attribute name; it used to be hard-coded to "link".
		return innerHtmlOf(doc.getElementsByAttributeValueMatching(key, Pattern.compile(value)));
	}

	/** Copies the inner HTML of each element into a new list, preserving order. */
	private static List<String> innerHtmlOf(Elements elements){
		List<String> list = new ArrayList<String>(elements.size());
		for(int i = 0; i < elements.size() ; i++){
			list.add(elements.get(i).html());
		}
		return list;
	}
}
